/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package cn.ac.iie.di.tools;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

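/**
 * Static helpers that use jsoup to pull plain text, image URLs and a
 * {@code yyyy-MM-dd} date out of HTML fragments.
 *
 * @author lenovo-pc
 */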
public class JSoupParserHtml {

	/** Date layout expected in the {@code <em>} text read by {@link #getImsgPt(String)}. */
	private static final String EM_DATE_PATTERN = "yyyy-MM-dd";

	/**
	 * Escaped-quote residue stripped from image URLs. The order matters:
	 * the longest sequence is removed first so that a shorter pattern does
	 * not consume part of it and leave stray backslashes behind.
	 */
	private static final String[] QUOTE_RESIDUE = {"\\\\\\\"", "\\\"", "\""};

	/**
	 * Extracts the visible text of an HTML fragment with whitespace normalised.
	 * <p>
	 * Every character for which {@link Character#isSpaceChar(char)} or
	 * {@link Character#isWhitespace(char)} is true (this includes Unicode
	 * space separators such as U+00A0) is treated as a blank; runs of blanks
	 * collapse to a single ASCII space and the result is trimmed.
	 *
	 * @param htmlStr HTML markup to parse; {@code null} is treated as empty
	 * @return the normalised text, never {@code null}
	 */
	public static String getTextFromHTML(String htmlStr) {
		if (htmlStr == null) {
			return "";
		}
		String rawText = Jsoup.parse(htmlStr).text();
		StringBuilder normalised = new StringBuilder(rawText.length());
		boolean previousWasBlank = false;
		for (int i = 0; i < rawText.length(); i++) {
			char current = rawText.charAt(i);
			if (Character.isSpaceChar(current) || Character.isWhitespace(current)) {
				// Emit one space per run of blanks, whatever kind they were.
				if (!previousWasBlank) {
					normalised.append(' ');
				}
				previousWasBlank = true;
			} else {
				normalised.append(current);
				previousWasBlank = false;
			}
		}
		return normalised.toString().trim();
	}

	/**
	 * Collects the {@code data-src} attribute of every {@code <img>} element.
	 * <p>
	 * Images whose {@code data-src} is missing or empty are skipped. Escaped
	 * quote sequences (backslash-backslash-backslash-quote, backslash-quote
	 * and a bare double quote) are removed from each value. A value that
	 * consists only of such sequences is still added, as an empty string.
	 *
	 * @param htmlStr HTML markup to parse; {@code null} yields an empty list
	 * @return the cleaned URLs in document order, never {@code null}
	 */
	public static ArrayList<String> getPicSrcFromHTML(String htmlStr) {
		ArrayList<String> picUrls = new ArrayList<>();
		if (htmlStr == null) {
			return picUrls;
		}
		for (Element img : Jsoup.parse(htmlStr).getElementsByTag("img")) {
			// attr() returns "" rather than null when the attribute is absent.
			String src = img.attr("data-src");
			if (src.isEmpty()) {
				continue;
			}
			for (String residue : QUOTE_RESIDUE) {
				src = src.replace(residue, "");
			}
			picUrls.add(src);
		}
		return picUrls;
	}

	/**
	 * Reads a {@code yyyy-MM-dd} date from the text of the {@code <em>}
	 * elements and converts it to epoch seconds.
	 * <p>
	 * The date is interpreted by {@link SimpleDateFormat} with its default
	 * settings, i.e. in the JVM's default time zone.
	 *
	 * @param ss HTML markup to parse; {@code null} yields {@code 0}
	 * @return seconds since the epoch for the parsed date, or {@code 0} when
	 *         the text cannot be parsed as a date
	 * @throws ParseException never thrown in practice, since parse failures
	 *         are reported as {@code 0}; the clause is kept so that existing
	 *         callers which catch it continue to compile
	 */
	public static long getImsgPt(String ss) throws ParseException {
		if (ss == null) {
			return 0;
		}
		String dateText = Jsoup.parse(ss).getElementsByTag("em").text();
		// SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
		SimpleDateFormat formatter = new SimpleDateFormat(EM_DATE_PATTERN);
		try {
			return formatter.parse(dateText).getTime() / 1000;
		} catch (ParseException ignored) {
			// Missing or malformed date: callers get 0 instead of an exception.
			return 0;
		}
	}

	/**
	 * Manual smoke test: prints the text extracted from a sample fragment.
	 * To try real data, read a line of HTML from a UTF-8 file and pass it to
	 * {@link #getTextFromHTML(String)}, {@link #getPicSrcFromHTML(String)}
	 * and {@link #getImsgPt(String)}.
	 *
	 * @param args unused
	 * @throws ParseException declared for compatibility with {@link #getImsgPt(String)}
	 */
	public static void main(String[] args) throws ParseException {
		String sample = "每日分享热门音乐视频，经典音乐好歌，更多精彩视频，敬请关注！\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6Am62g\\\">→要你成功</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6zAfj7\\\">→点点看</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6zAfsM\\\">→睡前一段话</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6zAieQ\\\">→清晨阅读</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6Ctk1h\\\">→音乐相册</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6zBtvx\\\">→早安有书</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6yeHBG\\\">→精彩阅读</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6CRBJT\\\">→热门视频</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6CRCEs\\\">→广场舞</a>\\n\\n♪ 点击查看<a href=\\\"http://dwz.cn/6zADBB\\\">→学习短视频</a>";
		System.out.println(getTextFromHTML(sample));
	}

}
